# Read updated dataset
bmw_data <- read.csv("/Users/jieminyang/Documents/08.BU/Academics/MA575/Project/Projects/BMWpricing_updated.csv", header=TRUE, as.is=TRUE)
# create summary
summary(bmw_data)
## maker_key model_key mileage engine_power
## Length:4843 Length:4843 Min. : -64 Min. : 0
## Class :character Class :character 1st Qu.: 102914 1st Qu.:100
## Mode :character Mode :character Median : 141080 Median :120
## Mean : 140963 Mean :129
## 3rd Qu.: 175196 3rd Qu.:135
## Max. :1000376 Max. :423
## registration_date fuel paint_color car_type
## Length:4843 Length:4843 Length:4843 Length:4843
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## feature_1 feature_2 feature_3 feature_4
## Mode :logical Mode :logical Mode :logical Mode :logical
## FALSE:2181 FALSE:1004 FALSE:3865 FALSE:3881
## TRUE :2662 TRUE :3839 TRUE :978 TRUE :962
##
##
##
## feature_5 feature_6 feature_7 feature_8
## Mode :logical Mode :logical Mode :logical Mode :logical
## FALSE:2613 FALSE:3674 FALSE:329 FALSE:2223
## TRUE :2230 TRUE :1169 TRUE :4514 TRUE :2620
##
##
##
## price sold_at obs_type
## Min. : 100 Length:4843 Length:4843
## 1st Qu.: 10800 Class :character Class :character
## Median : 14200 Mode :character Mode :character
## Mean : 15828
## 3rd Qu.: 18600
## Max. :178500
# Create the Age Variable (year sold - year register)
sold_at_split <- strsplit(bmw_data$sold_at, "/")
registration_split <- strsplit(bmw_data$registration_date, "/")
bmw_data$month_sold <- sapply(sold_at_split, function(x) as.integer(x[1]))
bmw_data$year_sold <- sapply(sold_at_split, function(x) as.integer(x[3]))
bmw_data$month_registered <- sapply(registration_split, function(x) as.integer(x[1]))
bmw_data$year_registered <- sapply(registration_split, function(x) as.integer(x[3]))
price <- bmw_data$price # our y variable
bmw_data$age <- bmw_data$year_sold-bmw_data$year_registered + (1/12)*(bmw_data$month_sold - bmw_data$month_registered) # our x variable
age <- bmw_data$age
length(price)
## [1] 4843
length(age)
## [1] 4843
#prepare predictors and response
response <- "price"
predictors <- c("mileage", "fuel", "paint_color", "car_type", "feature_1", "feature_2", "feature_3", "feature_4", "feature_5", "feature_6", "feature_7", "feature_8", "year_registered", "month_registered")
bmw_subset <- subset(bmw_data, select = c(response, predictors))
To get the idea of how each predictors are distributed, we are creating pairwise plots to visualize one on one correlation with car price and the distribution of the predictors.
for (predictor in predictors) {
# Subset the dataframe for the current predictor variable
plot_data <- subset(bmw_subset, select = c(response, predictor))
# Create a scatterplot/correlation graph
g <- ggpairs(plot_data,
title = paste("Scatterplot and correlation for price and", predictor),
upper = list(continuous = wrap("points", alpha=0.3, size=0.1)),
lower = list(continuous = "cor", method = "spearman")) + theme(axis.text = element_text(size = 6))
# Print the graph
print(g)
}
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
We decide to use Age as the predictor of our simple linear regression. It shows car price and car age are right skewed, with few extreme values at the tail.
# check the distribution of 2 variables
scatterplot(log(age), log(price),
ylab="Price Sold in $", xlab="Age of Car",
pch=19, cex=0.2)
# boxplot shows both variable is not normally distributed, scatterplot detects extreme outliers
scatterplotMatrix( ~ price + age, pch=19, cex=0.2)
Our first model is simply using age as x variable and price as y variable. We set this as the benchmark predicability. R^2 is 0.1987 which means approximately 19.87% of variations in price is explained by the model. The NQQ plot shows great deviation from normal quantile, indicating data on the upper tail is highly skewed. A U-shape pattern is identified in the SR plot, indicating non-constant variance. Several points with high leverage is observed, but no potential bad leverage point detected, since all leverage points lies in the Cook distance. The model need improvements on the above mentioned issues.
m1 <- lm(price~age)
summary(m1)
##
## Call:
## lm(formula = price ~ age)
##
## Residuals:
## Min 1Q Median 3Q Max
## -19229 -4763 -1643 2408 162647
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 24608.65 280.26 87.81 <2e-16 ***
## age -1616.40 46.74 -34.58 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8258 on 4841 degrees of freedom
## Multiple R-squared: 0.1981, Adjusted R-squared: 0.1979
## F-statistic: 1196 on 1 and 4841 DF, p-value: < 2.2e-16
scatterplot(age, price,
ylab="Price Sold in $", xlab="Age of Car",
pch=19, cex=0.2)
abline(m1, col = 'red')
summary(m1)$r.squared
## [1] 0.1981027
par(mfrow = c(2,2))
plot(m1, cex = 1, pch = 5)
Cheking the outlier Here we use leverage score and z-score of residual to find potential outliers and bad leverage points, and filter them out. After cleaning, the clustering problem in the data is solved (shown on the next graph), and the not skewed distributed variables become nearly normal. We lost 411 data points through cleaning.
# make a new dataframe for cleaning outlier
model_data <- data.frame(age = age, price = bmw_data$price)
# Use leverage to check the outlier on age of cars
lev <- hatvalues(m1)
model_data$filter1 <- lev <= (4/length(age))
# use z-score for residuals to check the outliers for age of cars
resid = residuals(m1)
z_resid = (resid - mean(resid))/sd(resid)
model_data$filter2 <- z_resid > (-3) & z_resid < 3
cleaned_data <- model_data[model_data$filter1 != FALSE & model_data$filter2 != FALSE, ]
cleaned_data <- cleaned_data[, -3:-4]
summary(cleaned_data) # almost normally distributed
## age price
## Min. :1.167 Min. : 100
## 1st Qu.:4.000 1st Qu.:11400
## Median :4.750 Median :14500
## Mean :4.909 Mean :15951
## 3rd Qu.:5.583 3rd Qu.:18800
## Max. :9.750 Max. :44600
summary(cleaned_data$age)# almost normally distributed
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.167 4.000 4.750 4.909 5.583 9.750
summary(cleaned_data$price)# almost normally distributed
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 100 11400 14500 15951 18800 44600
# plot the cleaned data again
scatterplot(cleaned_data$age, cleaned_data$price,
ylab="Price Sold in $", xlab="Age of Car",
pch=19, cex=0.2)
length(cleaned_data$age)-length(age)
## [1] -411
Our second experiment model is age vs price after cleaning. The r^2 is 0.1273, which is significantly lower than the model without cleaning. There is still large deviation on both tails from quantile of normal distribution on the NQQ plot. SR and leverage points has much improved.
# use the correlation matrix plot to check linear association and distribution
ggpairs(cleaned_data,
upper=list(continuous=wrap("points", alpha=0.3, size=0.1)),
lower=list(continuous=wrap('cor', size=4)))
# age normally distributed
# price right skewed, need log transformation
# some negative linear association as r = -0.357
m2 <- lm(cleaned_data$price~cleaned_data$age)
summary(m2)
##
## Call:
## lm(formula = cleaned_data$price ~ cleaned_data$age)
##
## Residuals:
## Min 1Q Median 3Q Max
## -19000 -4303 -1309 2882 25236
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 25055.89 371.61 67.43 <2e-16 ***
## cleaned_data$age -1854.83 72.96 -25.42 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6596 on 4430 degrees of freedom
## Multiple R-squared: 0.1273, Adjusted R-squared: 0.1271
## F-statistic: 646.3 on 1 and 4430 DF, p-value: < 2.2e-16
plot(cleaned_data$age, cleaned_data$price, col = rgb(0,0,0, alpha = 0.5), cex = 0.1)
abline(m2, col = 'red')
summary(m2)$r.squared
## [1] 0.1273159
par(mfrow = c(2,2))
plot(m2, cex = 1, pch = 5)
## Model 3 After taking log on the y variable, price, deviation from
normal became worse on the NQQ plot. Patterns in the scatterplot has
much improved, with clear linearity observed. Patterns in the SR plot
has improved as well, and number of high leverage point is greatly
reduced. R^2 at 0.3735 shows improvements in the predicability.
# now apply log transformation to y and check the predictability of the model
logprice = log(price)
m3 <- lm(logprice~age)
plot(age, logprice, col = rgb(0,0,0, alpha = 0.5), cex = 0.1)
abline(m3, col = 'red')
summary(m3)$r.squared
## [1] 0.373509
par(mfrow = c(2,2))
plot(m3, cex = 1, pch = 5)
## Model 4 After doing log transform on age on the cleaned data, the
above identified issue didn’t improved much. Improvements shows in NQQ
plot, with deviation only shown in the lower tail, and upper tail become
approximately normal. Patterns in the SR plot has become worse, few
points with extreme negative SR shows in the bottom half, and most point
has positive SR, which is a position we don’t want.The R^2 at 0.0831
comfirm our observation that the predicability of model has been
reduced. We would potentially drop the cleaned data.
# now apply log transformation to cleaned dataset on y and check the predictability of the model
c_logprice = log(cleaned_data$price)
c_age = cleaned_data$age
m4 <- lm(c_logprice~c_age)
plot(c_age, c_logprice, col = rgb(0,0,0, alpha = 0.5), cex = 0.1)
abline(m4, col = 'red')
summary(m4)$r.squared
## [1] 0.08310191
par(mfrow = c(2,2))
plot(m4, cex = 1, pch = 5)
# cleaned data has significanly lower r-squared value, considering not using the cleaned data
Some patterns shows in the residual plot, but overall is pretty good (mean residual near 0, and little pattern is observed). Non-linear pattern oberved on the scatterplot. Normal QQ plot shows improvement in the middle part of the data, and the tail and bottoms has more deviations than before. Maybe because of the increase deviation on the tails, R^2 has been reduced to 0.3122 compared to the model with log transformation only on y.
# has seen worsen prediction power in the cleaned data... now get back to the original data
# apply log transformation to both x and y and test the model
logprice = log(price)
logage = log(age)
m5 <- lm(logprice~logage)
plot(logage, logprice, col = rgb(0,0,0, alpha = 0.5), cex = 0.1)
abline(m5, col = 'red')
summary(m5)$r.squared
## [1] 0.3122484
par(mfrow = c(2,2))
plot(m5, cex = 1, pch = 5)
## Model 6 Then we tried take 1/4 root of price, this model is okay with
R^2 at 0.3545. But there is pattern in the SR plot, which is not an
ideal model because constant variance was violated.
rootprice = price^0.25
age = age
m6 <- lm(rootprice~age)
plot(age, rootprice, col = rgb(0,0,0, alpha = 0.5), cex = 0.1)
abline(m6, col = 'red')
summary(m6)$r.squared
## [1] 0.3544822
par(mfrow = c(2,2))
plot(m6, cex = 1, pch = 5)
## Final Model As discussed above, doing log transformation only on y is
the best choice for our simple linear regression. Choose this as our
regression model for further analysis.
both x is highly right skewed and logy has slightly trend of left skewness. We may address it in further analysis in the next deliverable.
bmw_model = data.frame(age, logprice)
plot(density(age))
abline(v = mean(age), col ='red')
abline(v = median(age), col = 'blue')
abline(v = mean(age)-3*sd(age), col ='purple')
abline(v = mean(age)+3*sd(age), col ='purple')
plot(density(log(price)))
abline(v = mean(logprice), col ='red')
abline(v = median(logprice), col = 'blue')
abline(v = mean(logprice)-3*sd(logprice), col ='purple')
abline(v = mean(logprice)+3*sd(logprice), col ='purple')
summary(m3)
##
## Call:
## lm(formula = logprice ~ age)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.2298 -0.2420 -0.0005 0.2664 2.5834
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 10.356672 0.017468 592.91 <2e-16 ***
## age -0.156505 0.002913 -53.72 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5147 on 4841 degrees of freedom
## Multiple R-squared: 0.3735, Adjusted R-squared: 0.3734
## F-statistic: 2886 on 1 and 4841 DF, p-value: < 2.2e-16
anova(m3)
The coefficient is: T value is: P value is: